notebook.community



In [178]:

    
import json
import codecs
import math
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#from pylab import *
import seaborn as sns
from IPython.display import display, HTML 
%matplotlib inline

#from mpltools import style
#from mpltools import layout
#style.use('ggplot')

#load proposals
proposals = json.loads(codecs.open(os.path.join("data","proposals.json"),"r").read())



In [18]:

    
sessions_df = pd.DataFrame(proposals['sessions'])
sessions_df.head(2)









    Out[18]:






  
    
      
      agenda
      facilitators
      goals
      organization
      outcomes
      scale
      theme
      themeSlug
      timestamp
      title
    
  
  
    
      0
       #Hackers techniques: Social Engineering , Tool...
       [{u'twitter': u'@BIG_MIGGY', u'name': u'Mohamm...
       They will be well known about the latest hacks...
       Mozilla Jordan community
       People will be aware of what happens around th...
       I will give them everything i have, everyone m...
             Policy & Advocacy
       policy
       2014-09-11T12:16:43.978Z
              Don't Hack Me
    
    
      1
       I'd showcase an application built using the st...
       [{u'twitter': u'@tpiros', u'name': u'Tamas Pir...
       The session would discuss the usefulness of th...
                               
       I am the owner of the MEAN Stack meetup group ...
       The biggest challenge would be to make sure th...
       Build and Teach the Web
        teach
       2014-09-11T12:16:43.978Z
       Be Smarter, Get MEAN



In [19]:

    
themes_df = pd.DataFrame(proposals['themes'])
themes_df.head()









    Out[19]:






  
    
      
      description
      name
      slug
      totalProposals
    
  
  
    
      0
       Keep the web wild through hands-on making with...
          Build and Teach the Web
            teach
       135
    
    
      1
       Escape the limitations of your computer and bu...
             Open Web With Things
         physical
        44
    
    
      2
       Explore opportunities in the booming world of ...
               Web in Your Pocket
           mobile
        26
    
    
      3
       Design next-generation web solutions to solve ...
       Source Code for Journalism
       journalism
        42
    
    
      4
       Examine the potential of the open web to re-de...
         Open Science and the Web
          science
        45

How Many Proposals Did MozFest 2014 Receive?



In [8]:

    
display(HTML("<p>This year, the Mozilla Festival received {0} proposals in {1} areas.</p>".format(len(proposals['sessions']), len(proposals['themes']))))









    




This year, the Mozilla Festival received 578 proposals in 11 areas.



In [58]:

    
themes_df.sort('totalProposals',ascending=True).plot('name','totalProposals',
                                                     kind='barh',
                                                     title='Number of proposals per Topic, #MozFest 2014')









    Out[58]:





<matplotlib.axes.AxesSubplot at 0x10eecfe50>

How Many Organizations Proposed Sessions at MozFest?



In [68]:

    
sessions_df.head()
sessions_df = sessions_df.replace(to_replace="Open Knowledge/ School of Data",value="Open Knowledge")
sessions_df[sessions_df['organization'] == "Open Knowledge"]









    Out[68]:






  
    
      
      agenda
      facilitators
      goals
      organization
      outcomes
      scale
      theme
      themeSlug
      timestamp
      title
    
  
  
    
      481
       We need 2 hours to ensure people have sufficie...
       [{u'twitter': u'@milena_iul', u'name': u'Milen...
       Spreadsheets can be a your best friend: they c...
       Open Knowledge
       We are looking to document the session in the ...
       We will break people down in small groups by t...
            Open Science and the Web
       science
       2014-09-11T12:16:44.039Z
                     Become a spreadsheet pro
    
    
      483
       We’ll split the crowd in small groups that wil...
       [{u'twitter': u'@beatricemartini', u'name': u'...
       Digital and open communities have taken multip...
       Open Knowledge
       After the session, we will collect all the ide...
       The overall idea is to have small groups of pe...
       (Community) Policy & Advocacy
           NaN
       2014-09-11T12:16:44.039Z
       Join! Types of diversity and inclusion
    
    
      493
       We need 2 hours to ensure participants have su...
       [{u'twitter': u'@miena_iul', u'name': u'Milena...
       We’ve all heard of “horror stories” where data...
       Open Knowledge
       We are looking to document the session in the ...
       We plan to work in small groups of 5-6 partici...
            Open Science and the Web
       science
       2014-09-11T12:16:44.040Z
                      Dealing with messy data
    
    
      554
       Active session  - \nA station of ideas to prom...
       [{u'twitter': u'@morchickit', u'name': u'Mor R...
       This is the third year we run the Global Open ...
       Open Knowledge
       We will post blog post about the session. Also...
       With 5-10 participants we can create a more in...
                           Open Data
          data
       2014-09-11T12:16:44.048Z
                Help us measure Open Gov Data



In [88]:

    
sessions_gb = sessions_df.groupby('organization')
org_count = sessions_gb.aggregate({'organization':len}).sort('organization',ascending=False)
org_count.head()









    Out[88]:






  
    
      
      organization
    
    
      organization
      
    
  
  
    
      
       102
    
    
      Mozilla
        28
    
    
      Northwestern University Knight Lab
        13
    
    
      mozillian
        11
    
    
      Mozilla Reps
         8



In [78]:

    
display(HTML("<p>This year, {0} organizations proposed sessions, with {1} orgs proposing more than one session.</p>".format(len(org_count)-1,len(org_count[org_count['organization'] > 1]))))









    




This year, 335 organizations proposed sessions, with 62 orgs proposing more than one session.



In [85]:

    
display(HTML("<p>{0} sessions, or {1:.2f}% of all sessions had no organization listed. </p>".format(org_count.ix['']['organization'],100*org_count.ix['']['organization']/float(len(sessions_df)))))









    




102 sessions, or 17.65% of all sessions had no organization listed.



In [104]:

    
org_count[org_count['organization'] > 2].sort('organization',ascending=True).plot(kind='barh',figsize=(3,6),title='Proposals from orgs with >2 proposals, #MozFest 2014')
plt.legend('')
plt.ylabel('')
plt.xlabel('Submissions')









    Out[104]:





<matplotlib.text.Text at 0x1151d9790>



In [159]:

    
def _uniques(Series):
    return len(set(Series))

def _avg_count(Series):
    return np.mean(Series.apply(len))

sessions_gb_theme = sessions_df.groupby('themeSlug')
theme_count_df = sessions_gb_theme.aggregate({'organization':_uniques,'title':len,'facilitators':_avg_count})
theme_count_df









    Out[159]:






  
    
      
      organization
      facilitators
      title
    
    
      themeSlug
      
      
      
    
  
  
    
      art
       38
       1.773585
        53
    
    
      badges
       17
       2.833333
        24
    
    
      data
       30
       2.175000
        40
    
    
      hive
       10
       2.000000
        14
    
    
      journalism
       33
       1.880952
        42
    
    
      mobile
       17
       1.923077
        26
    
    
      music
       13
       2.117647
        17
    
    
      physical
       37
       1.954545
        44
    
    
      policy
       42
       2.491228
        57
    
    
      science
       36
       2.755556
        45
    
    
      teach
       80
       2.059259
       135



In [176]:

    
# For some reason the colormap isn't working
ax = theme_count_df.plot(x='title',y='organization',
                    kind='scatter',
                    colormap='autumn_r',
                    c=theme_count_df['facilitators'],
                    s=20*np.e**theme_count_df['facilitators'],
                    figsize=(8,4))

for i in theme_count_df.index:
    x = theme_count_df.ix[i]['title']
    y = theme_count_df.ix[i]['organization']
    plt.annotate(
        i, 
        xy = (x, y), xytext = (-20, 20),
        textcoords = 'offset points', ha = 'right', va = 'bottom',
        bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
        arrowprops = dict(arrowstyle = '->',color='black', connectionstyle = 'arc3,rad=0'))
    
plt.xlabel('Number of Sessions Proposed per Theme')
plt.ylabel('Number of Proposing Organizations per Theme')
title('Number of Session Proposals and Proposing Organizations by Theme, #MozFest 2014\n (the color of dots and number on labels is the avg session facilitator count)')









    Out[176]:





<matplotlib.text.Text at 0x1185b60d0>



In [179]:

    
from byline_gender import BylineGender

bg = BylineGender()
people = {}
for session in proposals['sessions']:
    org = session['organization'] #not necessarily membership, maybe collaboration
    for facilitator in session['facilitators']:
        name = facilitator['name']
        twitter = facilitator['twitter']
        if(len(name)==0 and len(twitter)>0):
            name = twitter
        if name not in people.keys():
            people[name]={"sessions":[],"orgs":[],"twitter":[],"gender":None}
        people[name]['sessions'].append(session)
        if len(org)>0 and org not in people[name]['orgs']:
            people[name]['orgs'].append(org)
        if len(twitter)>0 and twitter not in people[name]['twitter']:
            people[name]['twitter'].append(twitter)









    



---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-179-7d1baed1357e> in <module>()
----> 1 from byline_gender import BylineGender
      2 
      3 bg = BylineGender()
      4 people = {}
      5 for session in proposals['sessions']:

/Users/brianckeegan/Dropbox/Hacking/conference-proposers/byline_gender.py in <module>()
----> 1 from gender_detector import GenderDetector
      2 import nltk
      3 import re
      4 import requests
      5 import StringIO

ImportError: No module named gender_detector



In [5]:

    
import unicodedata
import string

def get_org_name(org,name):
    asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
    asciiname = ''.join(ch for ch in asciiname if ch not in string.punctuation)
    org = ''.join(ch for ch in org if ch not in string.punctuation)
    return (org,asciiname)


from byline_gender import BylineGender
bg = BylineGender()
people = {}
for session in proposals['sessions']:
    org = session['organization'] #not necessarily membership, maybe collaboration
    for facilitator in session['facilitators']:
        name = facilitator['name']
        twitter = facilitator['twitter']
        if(len(name)==0 and len(twitter)>0):
            name = twitter
        if name not in people.keys():
            people[name]={"sessions":[],"orgs":[],"twitter":[],"gender":None}
        people[name]['sessions'].append(session)
        if len(org)>0 and org not in people[name]['orgs']:
            people[name]['orgs'].append(org)
        if len(twitter)>0 and twitter not in people[name]['twitter']:
            people[name]['twitter'].append(twitter) 

#ONE TIME ONLY: Generate Name CSV to import to Google Spreadsheets
#TODO: SAVE ACTUAL ASCII NAMES AND ORG NAMES
#orgnames = {}
#for name in people.keys():
#    if len(name)==0:
#        continue
#    person = people[name]
#    if(len(person['orgs'])>0):
#        org = person['orgs'][0]
#    elif(len(person['twitter'])>0):
#        org = person['twitter'][0]
#    #alas, the python version of Open Gender Tracker is not unicode safe :p
#    #asciiname = unicodedata.normalize('NFKD', name).encode('ascii','ignore')
#    #asciiname = ''.join(ch for ch in asciiname if ch not in string.punctuation)
#    #org = ''.join(ch for ch in org if ch not in string.punctuation)
#    org,asciiname = get_org_name(org,name)
#    if org not in orgnames.keys():
#        orgnames[org]={}
#    if asciiname not in orgnames[org].keys():
#        orgnames[org][asciiname] = len(person['sessions'])

#f = codecs.open("mozfest_org_names.csv","w", "utf-8")
#bg.export_org_names(orgnames,f)
#f.close()



In [35]:

    
#GENERATE A DATASET OF GENDER PER THEME

theme_people = {}

for theme in proposals['themes']:
    slug = theme['slug']
    #filter by sessions that have the current theme
    sessions = [x for x in proposals['sessions'] if 'themeSlug' in x.keys() and x['themeSlug'] == slug]
    #initialize hash
    if slug not in theme_people.keys():
        theme_people[slug] ={"facilitators":[],
                             "inclusive":{"female":0,"male":0,"unknown":0,"total":0},
                             "unique":{"female":0,"male":0,"unknown":0,"total":0},
                             }    
    for session in sessions:
        org = ""
        if(len(session['organization'])>0):
            org = session['organization']
        for person in session['facilitators']:
            if(len(org)==0 and len(person['twitter'])>0):
                org = person['twitter']
            name = person['name']
            
            asciiname = ''.join(ch for ch in name if ch not in string.punctuation)
            org = ''.join(ch for ch in org if ch not in string.punctuation)
            
            inferred_gender = bg.org_name_gender(org,asciiname)
            if(inferred_gender != "ignore"):
                theme_people[slug]['inclusive'][inferred_gender] += 1
                theme_people[slug]['inclusive']['total']+=1
                if name not in theme_people[slug]['facilitators']:
                    theme_people[slug]['unique'][inferred_gender]+=1
                    theme_people[slug]['unique']['total']+=1
                    theme_people[slug]['facilitators'].append(name)

#generate gender specific series

def pct(a,b):
    return 100.*(float(a)/float(b))

themes = [x[0] for x in sorted([(x,theme_people[x]['inclusive']) for x in theme_people.keys()],key=lambda x: x[1],reverse=True)]
female = {"unique":[],"inclusive":[],'unique_pct':[],'inclusive_pct':[]}
male = {"unique":[],"inclusive":[],'unique_pct':[],'inclusive_pct':[]}
unknown = {"unique":[],"inclusive":[],'unique_pct':[],'inclusive_pct':[]}
unknown_bottom = {'unique':[], 'inclusive':[],'unique_pct':[],'inclusive_pct':[]}
for theme in themes:

    for k in ['unique','inclusive']:
        female[k].append(theme_people[theme][k]['female'])
        male[k].append(theme_people[theme][k]['male'])
        unknown[k].append(theme_people[theme][k]['unknown'])
        unknown_bottom[k].append(female[k][-1] + male[k][-1])

        
        female[k+"_pct"].append(pct(female[k][-1],theme_people[theme][k]['total']))
        male[k+"_pct"].append(pct(male[k][-1],theme_people[theme][k]['total']))
        unknown[k+"_pct"].append(pct(unknown[k][-1],theme_people[theme][k]['total']))
        unknown_bottom[k+'_pct'].append(female[k+"_pct"][-1] + male[k+"_pct"][-1])



In [7]:

    
sum([len(theme_people[x]['facilitators']) for x in theme_people.keys()])









    Out[7]:





924



In [8]:

    
ind = np.arange(len(themes))
width = 0.4       # the width of the bars: can also be len(x) sequence  

for j in ['unique','inclusive']:
    for a in ['','_pct']:
        k = j+a
        fig = plt.figure(figsize=(14, 8))   
        ax = fig.add_subplot(111)
        ax = fig.add_subplot(111)

        #print "{0},{1},{2}".format(len(female[k]),len(male[k]),len(unknown_bottom[k]))

        p1 = ax.bar(ind, female[k],   width, color='#48C8B8')
        p2 = ax.bar(ind, male[k], width, color='#E8CA33',
                     bottom=female[k])
        p3 = ax.bar(ind, unknown[k], width, color='#cccccc',
                     bottom=unknown_bottom[k])
        plt.xticks(ind+width/2.)
        if(a =="_pct"):
            plt.yticks(np.arange(0,101,10))
            plt.ylim(ymax = 100, ymin = 0)
        ax.set_xticklabels(themes, rotation=45, fontsize=18,ha='center')

        plt.ylabel("Number of {0} session facilitators".format(j))
        plt.title("Inferred Sex of #MozFest 2014 proposed session facilitators ({0})\n".format(j), fontsize=16)
        plt.legend( (p1[0], p2[0],p3[0]), ('Female', 'Male',"Unknown"), bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0. )

        plt.show()



In [ ]:

    
# ADD GENDER TO SESSION HASH
for i in np.arange(0,len(proposals['sessions'])):
    proposals['sessions'][i][u'gender']={u'female':0,u'male':0,u'unknown':0,u'total':0}
    for person in proposals['sessions'][i]['facilitators']:
        name = person['name']
        org = proposals['sessions'][i]['organization']
        asciiname = ''.join(ch for ch in name if ch not in string.punctuation)
        org = ''.join(ch for ch in org if ch not in string.punctuation)
        inferred_gender = bg.org_name_gender(org,asciiname)
        if(inferred_gender != "ignore"):
            proposals['sessions'][i][u'gender'][inferred_gender]+=1
            proposals['sessions'][i][u'gender'][u'total']+=1



In [37]:

    
f = codecs.open("mozilla_session_gender.csv","w","utf8")
f.write(','.join(["slug","org","title","female","male","unknown","total"])+"\n")
for session in proposals['sessions']:
    if 'themeSlug' in session.keys() and 'organization' in session.keys():
        org = ''.join(ch for ch in session['organization'] if ch not in string.punctuation)
        title = ''.join(ch for ch in session['title'] if ch not in string.punctuation)
        gender = session['gender']
        f.write(','.join([session['themeSlug'],org,title,str(gender[u'female']),str(gender[u'male']),str(gender[u'unknown']),str(gender[u'total'])]) + "\n")
f.close()



In [ ]:

	agenda	facilitators	goals	organization	outcomes	scale	theme	themeSlug	timestamp	title
0	#Hackers techniques: Social Engineering , Tool...	[{u'twitter': u'@BIG_MIGGY', u'name': u'Mohamm...	They will be well known about the latest hacks...	Mozilla Jordan community	People will be aware of what happens around th...	I will give them everything i have, everyone m...	Policy & Advocacy	policy	2014-09-11T12:16:43.978Z	Don't Hack Me
1	I'd showcase an application built using the st...	[{u'twitter': u'@tpiros', u'name': u'Tamas Pir...	The session would discuss the usefulness of th...		I am the owner of the MEAN Stack meetup group ...	The biggest challenge would be to make sure th...	Build and Teach the Web	teach	2014-09-11T12:16:43.978Z	Be Smarter, Get MEAN

	description	name	slug	totalProposals
0	Keep the web wild through hands-on making with...	Build and Teach the Web	teach	135
1	Escape the limitations of your computer and bu...	Open Web With Things	physical	44
2	Explore opportunities in the booming world of ...	Web in Your Pocket	mobile	26
3	Design next-generation web solutions to solve ...	Source Code for Journalism	journalism	42
4	Examine the potential of the open web to re-de...	Open Science and the Web	science	45

	agenda	facilitators	goals	organization	outcomes	scale	theme	themeSlug	timestamp	title
481	We need 2 hours to ensure people have sufficie...	[{u'twitter': u'@milena_iul', u'name': u'Milen...	Spreadsheets can be a your best friend: they c...	Open Knowledge	We are looking to document the session in the ...	We will break people down in small groups by t...	Open Science and the Web	science	2014-09-11T12:16:44.039Z	Become a spreadsheet pro
483	We’ll split the crowd in small groups that wil...	[{u'twitter': u'@beatricemartini', u'name': u'...	Digital and open communities have taken multip...	Open Knowledge	After the session, we will collect all the ide...	The overall idea is to have small groups of pe...	(Community) Policy & Advocacy	NaN	2014-09-11T12:16:44.039Z	Join! Types of diversity and inclusion
493	We need 2 hours to ensure participants have su...	[{u'twitter': u'@miena_iul', u'name': u'Milena...	We’ve all heard of “horror stories” where data...	Open Knowledge	We are looking to document the session in the ...	We plan to work in small groups of 5-6 partici...	Open Science and the Web	science	2014-09-11T12:16:44.040Z	Dealing with messy data
554	Active session - \nA station of ideas to prom...	[{u'twitter': u'@morchickit', u'name': u'Mor R...	This is the third year we run the Global Open ...	Open Knowledge	We will post blog post about the session. Also...	With 5-10 participants we can create a more in...	Open Data	data	2014-09-11T12:16:44.048Z	Help us measure Open Gov Data

	organization
organization
	102
Mozilla	28
Northwestern University Knight Lab	13
mozillian	11
Mozilla Reps	8

	organization	facilitators	title
themeSlug
art	38	1.773585	53
badges	17	2.833333	24
data	30	2.175000	40
hive	10	2.000000	14
journalism	33	1.880952	42
mobile	17	1.923077	26
music	13	2.117647	17
physical	37	1.954545	44
policy	42	2.491228	57
science	36	2.755556	45
teach	80	2.059259	135